From: C. Scott Ananian Date: Wed, 22 Jul 2015 20:07:27 +0000 (-0500) Subject: T106578: Update Sanitizer to match legal HTML5 character entities. X-Git-Tag: 1.31.0-rc.0~10356 X-Git-Url: http://git.cyclocoop.org/%7D%7Cconcat%7B?a=commitdiff_plain;h=bc75784cbb6d75a244c1d28dd99ac34baf930fdb;p=lhc%2Fweb%2Fwiklou.git T106578: Update Sanitizer to match legal HTML5 character entities. Invalid HTML5 character entities become instances of UTF8_REPLACEMENT, so we also ensure that checkCSS notices this and emits the proper human-friendly sanitization notice. Change-Id: I76cef7c772b1e3eba0af8dab6403e9100beab03a --- diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 30981c368f..e8f06c46b1 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -966,7 +966,8 @@ class Sanitizer { $value = self::normalizeCss( $value ); // Reject problematic keywords and control characters - if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) { + if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) || + strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) { return '/* invalid control char */'; } elseif ( preg_match( '! expression @@ -1399,15 +1400,19 @@ class Sanitizer { } /** - * Returns true if a given Unicode codepoint is a valid character in XML. + * Returns true if a given Unicode codepoint is a valid character in + * both HTML5 and XML. * @param int $codepoint * @return bool */ private static function validateCodepoint( $codepoint ) { + # U+000C is valid in HTML5 but not allowed in XML. + # U+000D is valid in XML but not allowed in HTML5. + # U+007F - U+009F are disallowed in HTML5 (control characters). 
return $codepoint == 0x09 || $codepoint == 0x0a - || $codepoint == 0x0d - || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) + || ( $codepoint >= 0x20 && $codepoint <= 0x7e ) + || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff ) || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff ); } diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index f6ca577186..266b2b01ac 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -15831,7 +15831,7 @@ CSS line continuation 2 !! wikitext
!! html -
+
!! end @@ -18164,6 +18164,38 @@ parsoid=wt2html,wt2wt,html2html

îî

!! end +# See: http://www.w3.org/TR/html5/syntax.html#character-references +# Note that U+000C (form feed) is not a valid XML character, so +# it is banned even though allowed in HTML5. +!! test +Illegal character references (T106578) +!! wikitext +; Null: &#00; +; FF: &#xC; +; CR: &#xD; +; Control (low): &#8; +; Control (high): &#x7F; &#x9F; +; Surrogate: &#xD83D;&#xDCA9; +; This is an okay astral character: &#x1F4A9; +!! html+tidy +
+
Null
+
&#00;
+
FF
+
&#xC;
+
CR
+
&#xD;
+
Control (low)
+
&#8;
+
Control (high)
+
&#x7F; &#x9F;
+
Surrogate
+
&#xD83D;&#xDCA9;
+
This is an okay astral character
+
💩
+
+!! end + !! test __FORCETOC__ override !! wikitext